# Connect to the local MongoDB instance holding the scraped MLB data.
import pymongo
# MongoDB Connections
client = pymongo.MongoClient('mongodb://localhost:27017')
# Database with game summaries, per-game player data, and projected stats.
db = client['MLB_DB']
def getStatAvg(stats_list, projections_str, player_projected_stats_doc):
    """Average each stat in stats_list across all players in a projections dict.

    Parameters
    ----------
    stats_list : list of str
        Stat keys to average (e.g. datahelp.batting_stats).
    projections_str : str
        Key in the Mongo document whose value maps player ids to stat dicts.
    player_projected_stats_doc : dict
        Document containing the per-player projections under projections_str.

    Returns
    -------
    list of float
        Per-stat averages, in the same order as stats_list. If there are no
        players, a list of zeros is returned instead of raising
        ZeroDivisionError.
    """
    stat_totals = [0.0] * len(stats_list)
    count = 0
    for player in player_projected_stats_doc[projections_str].values():
        for stat_index, stat in enumerate(stats_list):
            # Missing or non-numeric stat values count as 0 rather than
            # aborting the whole aggregation (narrowed from a bare except).
            try:
                stat_val = float(player[stat])
            except (KeyError, TypeError, ValueError):
                stat_val = 0.0
            stat_totals[stat_index] += stat_val
        count += 1
    if count == 0:
        # No players projected for this side of the game.
        return stat_totals
    return [total / count for total in stat_totals]
import datahelp
def getData(summaryDB, PlayerDataDB, PlayerProjectionsDB):
    """Build one feature row per game by joining three collections on game_id_num.

    Parameters
    ----------
    summaryDB : pymongo collection
        Game summaries; supplies game_id_num and home/away scores.
    PlayerDataDB : pymongo collection
        Per-game player data; supplies the home-plate umpire id.
    PlayerProjectionsDB : pymongo collection
        Per-game projected player stats.

    Returns
    -------
    list of list
        Each row: home batting avgs + away batting avgs + home pitching avgs
        + away pitching avgs + [umpire id] + [home win label (1/0)].
    """
    data = []
    for document in summaryDB.find():
        game_id = document['game_id_num']
        # NOTE(review): nested find() loops assume one matching document per
        # game in each collection; duplicates would produce duplicate rows.
        for player_data in PlayerDataDB.find({'game_id_num': str(game_id)}):
            home_umpire = [player_data['umpires']['home']['id']]
            for player_projections in PlayerProjectionsDB.find({'game_id_num': str(game_id)}):
                # Get Stat averages
                home_batting_avg = getStatAvg(datahelp.batting_stats, 'home_batting_projections', player_projections)
                away_batting_avg = getStatAvg(datahelp.batting_stats, 'away_batting_projections', player_projections)
                home_pitching = getStatAvg(datahelp.pitching_stats, 'home_pitcher_projections', player_projections)
                away_pitching = getStatAvg(datahelp.pitching_stats, 'away_pitcher_projections', player_projections)
                # Label: 1 if the home team outscored the away team.
                home_win = [1 if document['home_score'] > document['away_score'] else 0]
                data_row = home_batting_avg + away_batting_avg + home_pitching + away_pitching + home_umpire + home_win
                data.append(data_row)
    return data
# Collections for completed (historical) games.
gameSummaryDB = db['GameSummary']
gamePlayerDataDB = db['GamePlayerData']
playerProjectionsDB = db['PlayerProjectedStats']
# Assemble one feature row per historical game.
data = getData(gameSummaryDB, gamePlayerDataDB, playerProjectionsDB)
import numpy as np
import pandas as pd
import datahelp
# Column names shared with getData's row layout (datahelp defines both).
columns = datahelp.columns
data_df = pd.DataFrame(np.array(data), columns=columns)
print("Data count (rows, columns) : " + str(data_df.shape))
data_df.head()
Data count (rows, columns) : (1202, 98)
| home hr avg | home gidp avg | home sac avg | home rbi avg | home tb avg | home slg avg | home avg avg | home bb avg | home ops avg | home hbp avg | ... | away pitching ip | away pitching w | away pitching r | away pitching pa | away pitching cg | away pitching gs | away pitching ibb | away pitching er | umpire_id | home win | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15.222222 | 9.444444 | 4.000000 | 80.666667 | 283.111111 | 0.393667 | 0.259111 | 60.111111 | 0.715444 | 10.888889 | ... | 142.1 | 10.0 | 65.0 | 617.0 | 0.0 | 11.0 | 1.0 | 58.0 | 484499.0 | 0.0 |
| 1 | 16.777778 | 8.333333 | 3.888889 | 77.444444 | 273.888889 | 0.384889 | 0.250778 | 59.555556 | 0.701222 | 14.666667 | ... | 57.2 | 1.0 | 36.0 | 265.0 | 0.0 | 10.0 | 1.0 | 35.0 | 482666.0 | 0.0 |
| 2 | 16.333333 | 7.777778 | 2.777778 | 63.222222 | 214.555556 | 0.377667 | 0.230000 | 45.111111 | 0.673111 | 10.888889 | ... | 191.2 | 7.0 | 138.0 | 869.0 | 0.0 | 28.0 | 1.0 | 130.0 | 427315.0 | 0.0 |
| 3 | 18.555556 | 8.888889 | 3.222222 | 78.222222 | 265.444444 | 0.406556 | 0.254111 | 52.666667 | 0.722778 | 12.666667 | ... | 388.2 | 22.0 | 185.0 | 1654.0 | 0.0 | 64.0 | 2.0 | 168.0 | 428442.0 | 1.0 |
| 4 | 17.555556 | 9.888889 | 3.222222 | 81.444444 | 280.888889 | 0.409444 | 0.262444 | 55.222222 | 0.728667 | 9.222222 | ... | 255.1 | 12.0 | 149.0 | 1143.0 | 0.0 | 46.0 | 2.0 | 138.0 | 427044.0 | 1.0 |
5 rows × 98 columns
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

# Features are every column except the label; 'home win' is the target.
X = data_df.drop(['home win'], axis = 1)
y = data_df['home win']
# Hold out 15% of games for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 15)
# Visualize class balance (wins vs losses) in the training set.
plt.figure(figsize=(5,5))
# Pass the series as the keyword arg x=: positional use is removed in
# seaborn 0.12 and was raising a FutureWarning.
g = sns.countplot(x=y_train, palette="icefire")
plt.title("Number of Wins and Losses")
y_train.value_counts()
X_train.head()
C:\Users\vin99\AppData\Local\Programs\Python\Python38\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
| home hr avg | home gidp avg | home sac avg | home rbi avg | home tb avg | home slg avg | home avg avg | home bb avg | home ops avg | home hbp avg | ... | away pitching h | away pitching ip | away pitching w | away pitching r | away pitching pa | away pitching cg | away pitching gs | away pitching ibb | away pitching er | umpire_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 976 | 22.222222 | 8.777778 | 2.444444 | 83.444444 | 273.555556 | 0.388667 | 0.241889 | 60.000000 | 0.693444 | 5.888889 | ... | 85.0 | 91.2 | 5.0 | 43.0 | 377.0 | 0.0 | 13.0 | 0.0 | 42.0 | 521251.0 |
| 136 | 37.666667 | 8.666667 | 2.555556 | 113.777778 | 357.444444 | 0.483111 | 0.264889 | 78.000000 | 0.813000 | 6.777778 | ... | 34.0 | 35.2 | 3.0 | 23.0 | 162.0 | 0.0 | 7.0 | 0.0 | 21.0 | 503493.0 |
| 1092 | 11.000000 | 5.666667 | 0.444444 | 35.666667 | 117.444444 | 0.371222 | 0.220444 | 29.111111 | 0.663444 | 4.333333 | ... | 43.0 | 51.2 | 3.0 | 25.0 | 215.0 | 0.0 | 10.0 | 0.0 | 20.0 | 427139.0 |
| 981 | 28.000000 | 6.222222 | 2.000000 | 100.222222 | 313.000000 | 0.442778 | 0.253667 | 68.777778 | 0.770778 | 8.666667 | ... | 252.0 | 233.0 | 11.0 | 155.0 | 1024.0 | 0.0 | 41.0 | 0.0 | 148.0 | 547380.0 |
| 620 | 33.777778 | 13.777778 | 3.666667 | 129.000000 | 393.555556 | 0.434333 | 0.262778 | 93.555556 | 0.770889 | 10.555556 | ... | 214.0 | 209.2 | 10.0 | 116.0 | 912.0 | 0.0 | 38.0 | 1.0 | 109.0 | 427128.0 |
5 rows × 97 columns
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train a three-hidden-layer MLP classifier on the game features and
# visualize its confusion matrix on the held-out set.
mlp = MLPClassifier(hidden_layer_sizes=(50,75,98), max_iter=2000)
mlp.fit(X_train, y_train)
y_predict = mlp.predict(X_test)
# Label order [1, 0] makes row/column order match the names given below.
cm = np.array(confusion_matrix(y_test, y_predict, labels=[1,0]))
confusion = pd.DataFrame(
    cm,
    index=['home_win', 'home_loss'],
    columns=['predicted_win', 'predicted_loss'],
)
sns.heatmap(confusion, annot=True)
<AxesSubplot:>
from sklearn.svm import SVC

# Fit a default-kernel support vector classifier and plot its confusion
# matrix for comparison with the MLP above.
svc_model = SVC()
svc_model.fit(X_train, y_train)
y_predict = svc_model.predict(X_test)
# Same [win, loss] label ordering as the MLP confusion matrix.
cm = np.array(confusion_matrix(y_test, y_predict, labels=[1,0]))
confusion = pd.DataFrame(
    cm,
    index=['home_win', 'home_loss'],
    columns=['predicted_win', 'predicted_loss'],
)
sns.heatmap(confusion, annot=True)
<AxesSubplot:>
import shap
from sklearn.svm import SVR

# Fit a degree-2 polynomial SVR and explain its predictions with
# permutation-based SHAP values (first 100 test rows only, for speed).
svr_poly = SVR(kernel='poly', C=1e3, degree=2).fit(X_train, y_train)
y_predict = svr_poly.predict(X_test)
explainer = shap.explainers.Permutation(svr_poly.predict, X_test)
shap_values_svc = explainer(X_test[:100], silent=True)
shap.plots.bar(shap_values_svc)
from sklearn.neural_network import MLPRegressor

# Fit an MLP regressor (fixed seed pins the weight initialization) and
# produce the same permutation-SHAP feature-importance bar plot.
mlp_regr = MLPRegressor(random_state=1, max_iter=2000)
mlp_regr = mlp_regr.fit(X_train, y_train)
y_predict = mlp_regr.predict(X_test)
explainer = shap.explainers.Permutation(mlp_regr.predict, X_test)
shap_values_svc = explainer(X_test[:100], silent=True)
shap.plots.bar(shap_values_svc)
from sklearn import tree
from dtreeviz.trees import *
import dtreeviz as dtviz
# Cap the tree depth to limit overfitting; echoed in the report below.
max_depth = 10
clf_dt = tree.DecisionTreeClassifier(max_depth=max_depth)
clf_dt = clf_dt.fit(X_train, y_train)
#plt.figure(figsize=(35,12))
#tree.plot_tree(clf_dt,feature_names=columns,class_names=['loss','win'],rounded=True,fontsize=8);
#plt.show()
predictions_dt = clf_dt.predict(X_test)
# Accuracy = 1 - (misclassified fraction) on the held-out set.
accuracy = 1-(((y_test != predictions_dt).sum())/len(X_test))
print("Decision Tree Classification (max depth =" + str(max_depth) + ")")
print("Number of mislabeled points out of a total %d points: %d" % (len(X_test), (y_test != predictions_dt).sum()))
print("Accuracy: %f" % (accuracy))
# Render the fitted tree left-to-right; columns[:-1] drops the
# 'home win' label column so feature names line up with X_train.
viz = dtviz.trees.dtreeviz(clf_dt,
X_train,
y_train,
target_name='home_win',
feature_names=columns[:-1],
orientation="LR",
class_names=['loss','win'],
fancy=True,
X=None,
label_fontsize=12,
ticks_fontsize=8,
fontname="Arial")
viz
Decision Tree Classification (max depth =10) Number of mislabeled points out of a total 181 points: 59 Accuracy: 0.674033
# Refit the decision tree with no depth cap for comparison. Note that
# clf_dt is rebound here and is the model used for the future-game
# predictions further down.
clf_dt = tree.DecisionTreeClassifier().fit(X_train, y_train)
predictions_dt = clf_dt.predict(X_test)
mislabeled = (y_test != predictions_dt).sum()
accuracy = 1 - (mislabeled / len(X_test))
print("Decision Tree Classification")
print("Number of mislabeled points out of a total %d points: %d" % (len(X_test), mislabeled))
print("Accuracy: %f" % (accuracy))
Decision Tree Classification Number of mislabeled points out of a total 181 points: 69 Accuracy: 0.618785
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters — referenced both by the model and the printed report.
max_depth = 7
criterion = "entropy"
# Use the variables rather than repeating the literals, so the printed
# report can never drift from the actual model configuration.
clf_rf = RandomForestClassifier(max_depth=max_depth, random_state=0, criterion=criterion)
clf_rf.fit(X_train, y_train)
predictions_rf = clf_rf.predict(X_test)
mislabeled = (y_test != predictions_rf).sum()
accuracy = 1 - (mislabeled / len(X_test))
print("Random Forest Classification")
print("max depth = " + str(max_depth) + ", criterion = " + criterion)
print("Number of mislabeled points out of a total %d points: %d" % (len(X_test), mislabeled))
print("Accuracy: %f" % (accuracy))
# Visualize a single tree from the forest (arbitrarily, estimator 5).
estimator = clf_rf.estimators_[5]
viz = dtviz.trees.dtreeviz(estimator,
X_train,
y_train,
target_name='home_win',
feature_names=columns[:-1],
orientation="LR",
class_names=['loss','win'],
fancy=True,
X=None,
label_fontsize=12,
ticks_fontsize=8,
fontname="Arial")
viz
Random Forest Classification max depth = 7, criterion = entropy Number of mislabeled points out of a total 181 points: 52 Accuracy: 0.712707
# Collections for upcoming (not yet played) games.
gameSummaryFutureDB = db['GameSummaryFutureGames']
gamePlayerDataFutureDB = db['GamePlayerDataFutureGames']
playerProjectionsFutureDB = db['PlayerProjectedStatsFutureGames']
# Reuse the same feature pipeline as the training data.
future_data = getData(gameSummaryFutureDB, gamePlayerDataFutureDB, playerProjectionsFutureDB)
future_data_df = pd.DataFrame(np.array(future_data), columns=columns)
print("Data count (rows, columns) : " + str(future_data_df.shape))
# The 'home win' column is meaningless for unplayed games (scores are
# placeholders); keep it aside and drop it from the feature frame.
y_test_future = future_data_df['home win']
future_data_df = future_data_df.drop(['home win'], axis = 1)
future_data_df
Data count (rows, columns) : (9, 98)
| home hr avg | home gidp avg | home sac avg | home rbi avg | home tb avg | home slg avg | home avg avg | home bb avg | home ops avg | home hbp avg | ... | away pitching h | away pitching ip | away pitching w | away pitching r | away pitching pa | away pitching cg | away pitching gs | away pitching ibb | away pitching er | umpire_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.222222 | 2.666667 | 0.444444 | 40.555556 | 111.444444 | 0.409000 | 0.236111 | 28.333333 | 0.724556 | 3.777778 | ... | 129.0 | 103.1 | 6.0 | 76.0 | 474.0 | 0.0 | 23.0 | 0.0 | 70.0 | 482641.0 |
| 1 | 11.777778 | 5.777778 | 0.111111 | 33.666667 | 103.888889 | 0.379222 | 0.215889 | 34.333333 | 0.683222 | 2.888889 | ... | 49.0 | 55.1 | 5.0 | 28.0 | 235.0 | 0.0 | 6.0 | 1.0 | 24.0 | 608093.0 |
| 2 | 11.111111 | 5.333333 | 0.333333 | 37.111111 | 118.222222 | 0.476444 | 0.265444 | 26.555556 | 0.809000 | 3.444444 | ... | 126.0 | 131.0 | 9.0 | 64.0 | 553.0 | 0.0 | 23.0 | 0.0 | 58.0 | 427292.0 |
| 3 | 10.333333 | 6.333333 | 0.888889 | 39.444444 | 125.666667 | 0.381889 | 0.247667 | 26.666667 | 0.686556 | 3.000000 | ... | 59.0 | 68.2 | 5.0 | 37.0 | 285.0 | 0.0 | 15.0 | 0.0 | 35.0 | 607884.0 |
| 4 | 10.333333 | 5.555556 | 0.444444 | 33.333333 | 107.555556 | 0.403556 | 0.228778 | 28.555556 | 0.709222 | 2.333333 | ... | 87.0 | 97.2 | 3.0 | 37.0 | 410.0 | 0.0 | 14.0 | 0.0 | 34.0 | 483630.0 |
| 5 | 6.444444 | 3.000000 | 0.111111 | 24.111111 | 72.888889 | 0.390222 | 0.249556 | 14.666667 | 0.692222 | 1.888889 | ... | 23.0 | 27.1 | 1.0 | 14.0 | 116.0 | 0.0 | 5.0 | 0.0 | 13.0 | 521889.0 |
| 6 | 13.555556 | 6.555556 | 0.333333 | 45.333333 | 130.888889 | 0.431000 | 0.244111 | 29.000000 | 0.754000 | 6.777778 | ... | 12.0 | 18.1 | 2.0 | 7.0 | 77.0 | 0.0 | 1.0 | 2.0 | 6.0 | 503490.0 |
| 7 | 11.555556 | 6.555556 | 0.666667 | 36.000000 | 115.555556 | 0.453000 | 0.268889 | 28.888889 | 0.800889 | 2.666667 | ... | 92.0 | 113.1 | 6.0 | 52.0 | 468.0 | 0.0 | 22.0 | 0.0 | 51.0 | 427248.0 |
| 8 | 9.888889 | 5.444444 | 1.444444 | 37.777778 | 113.222222 | 0.398667 | 0.240778 | 26.111111 | 0.705000 | 4.222222 | ... | 73.0 | 74.1 | 4.0 | 39.0 | 311.0 | 0.0 | 15.0 | 1.0 | 39.0 | 503077.0 |
9 rows × 97 columns
# Predict upcoming games with the (unpruned) decision tree and print the
# projected winner for each matchup.
future_predictions = clf_dt.predict(future_data_df)
# zip pairs each summary document with its prediction and stops at the
# shorter sequence, replacing the manual counter (which would raise
# IndexError if the cursor returned more documents than predictions).
for document, prediction in zip(gameSummaryFutureDB.find(), future_predictions):
    # e.g. "Baltimore Orioles @ Tampa Bay Rays [2021-08-16T23:10:00Z]"
    summary_string = document['away_name'] + ' @ ' + document['home_name'] + " [" + document['game_datetime'] + "]"
    print(summary_string)
    # Label 1 means the home team is projected to win.
    if prediction == 1:
        print(" winner: " + document['home_name'])
    else:
        print(" winner: " + document['away_name'])
Baltimore Orioles @ Tampa Bay Rays [2021-08-16T23:10:00Z]
winner: Tampa Bay Rays
Los Angeles Angels @ New York Yankees [2021-08-16T23:05:00Z]
winner: New York Yankees
Oakland Athletics @ Chicago White Sox [2021-08-17T00:10:00Z]
winner: Chicago White Sox
Houston Astros @ Kansas City Royals [2021-08-17T00:10:00Z]
winner: Kansas City Royals
Cleveland Indians @ Minnesota Twins [2021-08-17T00:10:00Z]
winner: Minnesota Twins
Atlanta Braves @ Miami Marlins [2021-08-16T23:10:00Z]
winner: Miami Marlins
Chicago Cubs @ Cincinnati Reds [2021-08-16T23:10:00Z]
winner: Cincinnati Reds
New York Mets @ San Francisco Giants [2021-08-17T01:45:00Z]
winner: San Francisco Giants
San Diego Padres @ Colorado Rockies [2021-08-17T00:40:00Z]
winner: San Diego Padres